!pip3 install plotly
!pip install jupyter_contrib_nbextensions
!pip3 install folium
!jupyter nbextension install <url>/toc2.zip --user
!jupyter nbextension enable toc2/main
## Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import folium
from folium.plugins import HeatMap
## Pandas Dataframe Library
import pandas as pd
## Numpy Library
import numpy as np
## Train and Test Split
from sklearn.model_selection import train_test_split
## Evaluation Matrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
## Normalize
from sklearn.preprocessing import MinMaxScaler
## Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
## Kfold and ROC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV,StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
df=pd.read_csv('feature_data.csv') #reading the feature CSV
label_df=pd.read_csv('label_data.csv') #reading the label CSV (provides the 'cancelation' target column)
Exploratory Analysis is a term used in data analysis; it refers to exploring the data and extracting useful information from it.
Exploratory Analysis is carried out once you have all the data collected, cleaned and processed. You find the exact information you need to carry out the analysis or if you need more through manipulating the data.
During this step, one can use various techniques in python (such as functions and plots) to carry out analysis, which leads to the understanding of the data in order to become able to interpret it more effectively and derive better conclusions according to the requirements.
df['cancelation'] = label_df['cancelation'] #attach the target column from the label file (assumes both CSVs are row-aligned — TODO confirm)
Checking the new column:
df.head()
print("Row's number:", df.shape[0]) #the 0 axis is the rows
print("Columns's number:", df.shape[1]) #the 1 axis is the columns
Another way for checking the dimensionality of the model:
df.shape
Checking columns names in the data file:
df.columns
Exploring the type of columns:
df.dtypes
Informations about the data:
df.info()
df.describe() #describing the data
Checking how many booking cancellation were done:
df['cancelation'].value_counts()
In this pie graph we can see the percentage of bookings that were cancelled and bookings that were not:
1 indicates a cancelled booking
0 indicates a not cancelled booking
# Pie chart: share of cancelled (1) vs. not-cancelled (0) bookings.
cancellation_graph = px.pie(df, values=df['cancelation'].value_counts().values, names=df['cancelation'].value_counts().index,
title='Cancelation' , color_discrete_sequence=px.colors.sequential.Peach)
cancellation_graph.update_traces(textposition='inside', textinfo='percent+label') #percentage + label inside each slice
cancellation_graph.show()
Checking the amount of orders for every type:
df['order_type'].value_counts()
In this pie graph we see the percentages for each type of orders:
We can notice that most people order "online TA"
order_types = px.pie(df, values=df['order_type'].value_counts().values, names=df['order_type'].value_counts().index,
title='The type of Orders' ,color_discrete_sequence=px.colors.sequential.Mint
)
order_types.update_traces(textposition='inside', textinfo='percent+label')
order_types.show()
Checking which country citizens book the most:
df['country'].value_counts().head(1)
Top 10 countries:
df['country'].value_counts().head(10)
Bar plot describing the amount of citizens from countries:
# Bar chart of the 10 most frequent guest countries; bar text shows the count.
fig = go.Figure(data=[go.Bar(
x=df['country'].value_counts().index[0:10], y=df['country'].value_counts().values[0:10],
text=df['country'].value_counts().values[0:10],
textposition='outside',marker_color='lightseagreen'
)])
fig.show()
Pie plot describing the amount of citizens from countries:
countries = px.pie(df, values=df['country'].value_counts().values, names=df['country'].value_counts().index,
title='Countries' ,color_discrete_sequence=px.colors.sequential.RdPu)
countries.update_traces(textposition='inside', textinfo='percent+label')
countries.show()
Creating a new feature that gives us the total number of guests (adults + children + babies):
df['guest'] = df['adults'] + df['children'] + df['babies'] #total party size: adults + children + babies
Showing the top 5 countries whose guests did not cancel their booking:
#cancelation==False in order to fetch only the orders that were not cancelled
guest_per_country = df[df['cancelation'] == False]['country'].value_counts().reset_index()
# NOTE(review): value_counts().reset_index() names the columns 'index'/'country'
# on pandas < 2.0; the rename below assumes that layout — confirm the pandas version.
guest_per_country.columns = ['country', 'guest'] #rename the two columns
guest_per_country.head()
A world map showing the countries with the number of guests:
Yellow indicates the largest number of guests
# World choropleth of (non-cancelled) guests per country; brighter = more guests.
# Country codes are used as plotly locations (default ISO-3 location mode).
# (The original also created an unused `folium.Map()` object here — that dead
# statement has been removed; the folium map was never populated or shown.)
guest_map = px.choropleth(guest_per_country, locations = guest_per_country['country'],
                          color = guest_per_country['guest'],
                          hover_name = guest_per_country['country'])
guest_map.show()
Bar plot showing the number of orders that were not cancelled, per month.
We can see that August and July have the most orders; since that is the summer vacation, most families prefer to travel then.
# Monthly counts of orders that were NOT cancelled.
month_order = df[df['cancelation'] == False]['order_month'].value_counts().reset_index()
# NOTE(review): reset_index() yields columns 'index'/'order_month' on pandas < 2.0;
# the references below assume that layout — confirm the installed version.
# Bar chart: X axis = month, Y axis = number of kept orders.
# (The original also built a px.scatter figure that was immediately overwritten
# by this go.Figure — that dead assignment has been removed.)
graph_order_month = go.Figure(data=[go.Bar(x=month_order['index'], y=month_order['order_month'],
                                           text=month_order['order_month'],
                                           textposition='outside', marker_color='green')])
graph_order_month.show()
# Pie chart of non-cancelled orders per month.
month_order = df[df['cancelation'] == False]['order_month'].value_counts()
# BUG FIX: the original took `names` from df['order_month'].value_counts() (ALL
# orders) while `values` came from the non-cancelled counts; the two Series can
# be ordered differently, mislabelling the slices. Use month_order's own index.
order_and_month = px.pie(df, values=month_order.values, names=month_order.index,
                         title='Orders and months', color_discrete_sequence=px.colors.sequential.BuGn)
order_and_month.update_traces(textposition='inside', textinfo='percent+label')
order_and_month.show()
We can see in this bar plot that the highest ADR is in August and we also can notice that most cancelations happens in the same month,we can conclude that adr is the reason of that.
plt.figure(figsize=(15,10)) #enlarge the figure
# Grouped bars: ADR per order month, split by cancellation status.
sns.barplot(x='order_month', y='adr', hue='cancelation', palette= 'summer', data=df)
plt.title('Order Month vs ADR vs Booking Cancellation Status')
Crosstab showing the number of customer types that cancelled their booking:
We can see that people with transient cancel the most
pd.crosstab([df["cancelation"]], df["customer_type"],margins = True).style.background_gradient(cmap = "gist_gray")
Catplot that shows the cancelations/not according to the customer type:
sns.catplot(x='customer_type', col = 'cancelation', data=df, kind = 'count', palette='Set2') #countplot
Crosstab showing the number of deposit types that cancelled their booking:
We can see that most of the cancellations are of those booking where
no deposit
pd.crosstab([df["cancelation"]], df["deposit_type"],margins = True).style.background_gradient(cmap = "Oranges")
Catplot that shows the cancellations (or not) according to the deposit type:
sns.catplot(x="deposit_type", col = 'cancelation', data=df, kind = 'count', palette='rainbow')
# Mean of the 0/1 cancelation flag per year = fraction of bookings cancelled that year.
adverage_of_cancellation = df.groupby(['order_year'])['cancelation'].mean()
# FIX(typo): 'Percantage' -> 'Percentage' in the printed message.
print("Cancellation Percentage per year",adverage_of_cancellation*100)
Barplot showing the average cancellation rate by year:
cancell_year =go.Figure(data=[go.Bar(x=adverage_of_cancellation.index, y=adverage_of_cancellation*100,text=adverage_of_cancellation*100,
textposition='outside',marker_color='grey')])
cancell_year.show()
# KDE of lead time ('time_until_order'), one curve per cancellation status.
# NOTE(review): `shade=` is deprecated in newer seaborn (replaced by `fill=`) —
# confirm the installed version still accepts it.
(sns.FacetGrid(df, hue = 'cancelation',height = 6,xlim = (0,500)).map(sns.kdeplot, 'time_until_order', shade = True)
.add_legend());
#the peak of cancellations is close to 50 days of lead time; people who order on short notice tend not to cancel.
# Histogram of every numeric column, to eyeball the distributions.
df_num = df.select_dtypes(include=np.number)
df_num.hist(figsize=(15,15))
plt.show()
#anon_feat_10 may already be normalized, but the rest are not
A preliminary processing of data in order to prepare it for the primary processing or for further analysis.
#The explanations of the correlation is done with the other corellation section
## Correlation before null-value handling
features = df.columns  ## all feature column names
## Pearson correlation; the mask hides the upper triangle so every pair of
## features appears exactly once in the heatmap.
## BUG FIX: np.bool was removed in NumPy 1.24 — use the builtin bool instead.
mask = np.zeros_like(df[features].corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True
## Plot
f, ax = plt.subplots(figsize=(16, 12))
plt.title('Pearson Correlation Matrix before null handling',fontsize=27)
sns.heatmap(df[features].corr(),linewidths=0.25,vmax=0.7,square=True,cmap="RdGy",
            linecolor='w',annot=True,annot_kws={"size":8},mask=mask,cbar_kws={"shrink": .9});
df.corr().style.background_gradient(cmap='coolwarm')
#The cell with darker culur indicates a stronger corellation
Changing the type of "Cancelation" column from bool into int by changing "True" into 1 and "False" into 0:
df['cancelation'].replace({True:1,False:0},inplace=True) #bool -> int so the target behaves numerically
df.isnull().sum() #count of null values per feature
Handling null values using interpolation:
Interpolation handles both object and numeric values easily.
(Interpolation is explained in the report.)
Two interpolation method will be used
## Linear interpolation handles the bulk of the missing values
df = df.interpolate()
## Pad interpolation fills whatever linear interpolation left behind;
## 'limit' caps how many consecutive NaNs may be filled from one existing value.
df = df.interpolate(method='pad', limit=15) #'pad': fill NaNs using the previous existing value
df = df.replace([np.inf, -np.inf], np.nan) ## convert infinite values into NaN values
df = df.dropna(how="any") #drop every row that still contains a NaN
df.isnull().sum() #verify no null values remain
Will use cat labelencoder to convert them easily:
(also exaplained in the Report)
The categorical type is a process of factorization. Meaning that each unique value or category is given a incremented integer value starting from zero.
# Integer-encode each categorical column: pandas' category dtype assigns every
# unique value an incrementing code starting at zero (a simple label encoding).
for _col in ('country', 'order_type', 'acquisition_channel',
             'deposit_type', 'customer_type', 'order_week'):
    df[_col] = df[_col].astype('category').cat.codes
Converting Data columns into one date column:
Datetime function select's specific columns like year, month and day so predefined columns name will be change:
df = df.rename(columns={'order_year': 'year', 'order_month': 'month', 'order_day_of_month': 'day'})
#We will rename the columns to use "to_datetime" that converts to numbers
Few of the rows has wrong date entry like June month is of 30 days but here there are entries for 31,So we will drop these entries:
## Drop rows whose day-of-month is impossible for the given month.
# 30-day months (June/September/November/April) cannot have day 31.
result = df.loc[df['month'].isin(['June', 'September','November','April']),['month','day'] ] [df['day'] == 31]
result = result.index
df = df.drop(result)
# February: drop days 29-31.
# NOTE(review): this also removes Feb 29 of leap years — confirm that losing
# valid leap-day rows is acceptable.
result1 = df.loc[df['month'].isin(['February']),['month','day'] ] [(df["day"] == 31) | (df["day"] == 30) | (df["day"] == 29) ]
result1 = result1.index
df = df.drop(result1)
now the months are with the correct num of days
Creating Date Object:
# Build a datetime from the renamed year/month/day columns.
# BUG FIX: the original omitted the separator between month and day
# (producing e.g. '2015/August31'); add '/' so pandas parses '2015/August/31'.
df['DATE'] = pd.to_datetime(df.year.astype(str) + '/' + df.month.astype(str) + '/' + df.day.astype(str))
Model can't train on date so to handle it each date will be converted into index:
df=df.set_index(df.DATE) #index by date; the model cannot train on a raw date column
df = df.drop(['Unnamed: 0','year','month','day','DATE'], axis = 1) #drop the now-redundant date parts and the CSV's stray index column
Using Z-score to find the outliers
df = df.astype(float) #cast everything to float so z-scores are well-defined
from scipy import stats
# |z-score| of every cell: how many standard deviations the value sits from its
# column mean.
outliers = np.abs(stats.zscore(df))
print(outliers)
threshold = 3 #values with |z| > 3 are treated as outliers
# NOTE(review): `threshold` is never used below — 3 is hard-coded in the filters.
print(np.where(outliers > 3)) #row/column indices of the outlier cells
Removing Outliers:
df = df[(np.abs(stats.zscore(df)) < 3).all(axis=1)] #keep only rows where every column's |z-score| is below 3
Checking The New Dataset without Outliers:
df.shape
Apply Pearson correlation to find the highly correlated features.
The correlation coefficient takes values between -1 and 1.
features = df.columns  ## all feature column names
## Pearson correlation; the mask hides the upper triangle so each feature pair
## appears exactly once in the heatmap.
## BUG FIX: np.bool was removed in NumPy 1.24 — use the builtin bool instead.
mask = np.zeros_like(df[features].corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True
## Plot
f, ax = plt.subplots(figsize=(16, 12))
plt.title('Pearson Correlation Matrix',fontsize=27)
sns.heatmap(df[features].corr(),linewidths=0.25,vmax=0.7,square=True,cmap="OrRd",
            linecolor='w',annot=True,annot_kws={"size":8},mask=mask,cbar_kws={"shrink": .9});
Highly correlated variables tend to carry similar information, which can bring down the performance of the model, so highly correlated features will be removed.
corr_matrix = df.corr().abs()
# Keep only the upper triangle of the correlation matrix (k=1 excludes the
# diagonal) so each feature pair is examined once.
# BUG FIX: np.bool was removed in NumPy 1.24 — use the builtin bool instead.
# (The original also computed `relevant_features = mask[mask > 0.8]`; `mask`
# is a boolean array, so that expression was meaningless and unused — removed.)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
Following features were removed from the dataset because of their high corellation:
# Columns whose correlation with any earlier column exceeds 0.80.
to_drop = [column for column in upper.columns if any(upper[column] > 0.80)]
# Drop them. FIX: pass the list of column names directly — the original passed
# df[to_drop] (a whole DataFrame) as the labels argument, which only worked
# because pandas iterates a DataFrame's column names.
df = df.drop(to_drop, axis=1)
to_drop
#splitting features and target
x=df.drop(['cancelation'], axis = 1) #all columns except the target
y=df.cancelation #the target label
On machine learning, the performance of a model only benefits from more features up until a certain point. The more features are fed into a model, the more the dimensionality of the data increases. As the dimensionality increases, overfitting becomes more likely.
The dimensionality problem usually occurs when there is a high number of features, as they can directly affect the model's predictions.
Here the dataset consists of only 35 features and is not high-dimensional; moreover, feature importance keeps only the important features for training, so dimensionality reduction is not required for this dataset.
Since there are a number of anonymized features and we do not know what they actually represent, we will apply a feature-importance technique to check the importance of each feature and its effect on model training.
# Rank features with a random forest's impurity-based feature importances.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(x, y)
# Importance score per feature, highest first.
feature_scores = pd.Series(clf.feature_importances_, index=x.columns).sort_values(ascending=False)
feature_scores
Here we can see that Babies has the least affect on the prediction of model ,but country has the most affect .
also there are many anontated features that have effect on the model prediction so those will be used and left for model training
# Horizontal bar chart of the feature-importance ranking.
f, ax = plt.subplots(figsize=(15, 7))
sns.barplot(x=feature_scores, y=feature_scores.index)
ax.set_title("The importance of Features")
ax.set_yticklabels(feature_scores.index)
ax.set_xlabel("Feature importance score")
ax.set_ylabel("Features")
plt.show()
So for the Model Top 20 Features are selected
## Keep only the 20 highest-ranked features for modelling
features = feature_scores.index[0:20]
x = x[features]
x
Feature Scaling or Standardization: It is a step of Data Pre Processing which is applied to independent variables or features of data. It basically helps to normalise the data within a particular range. Sometimes, it also helps in speeding up the calculations in an algorithm. So, Normalization is imporant to scale all data equally for beter results
We can see from looking at the data that its not normalized.
For scaling we are using minmax
# Get column names first (fit_transform returns a bare ndarray)
names = x.columns
# Create the scaler object: MinMaxScaler maps every feature into [0, 1]
sc = MinMaxScaler()
# Fit the scaler and transform the data
x = sc.fit_transform(x)
x = pd.DataFrame(x, columns=names) #back to a DataFrame with the original column names
# 65/35 train/validation split, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.35, random_state = 47)
print ("X_train: ", len(X_train))
print("X_test: ", len(X_test))
print("y_train: ", len(y_train))
print("y_test: ", len(y_test))
test_df = pd.read_csv ('feature_data_test.csv') #loading the data
Creating function in order to preprocess:
def test_preprocess(df,features):
df = df.interpolate()
df = df.interpolate(method='pad', limit=15)
df = df.replace([np.inf, -np.inf], np.nan) ## convert inifite values into Nan Values
df = df.dropna(how="any")
df['country'] =df['country'].astype('category').cat.codes
df['order_type'] =df['order_type'].astype('category').cat.codes
df['acquisition_channel'] =df['acquisition_channel'].astype('category').cat.codes
df['deposit_type'] =df['deposit_type'].astype('category').cat.codes
df['customer_type'] =df['customer_type'].astype('category').cat.codes
df['order_week'] =df['order_week'].astype('category').cat.codes
df = df[features]
return df
test_df = test_preprocess(test_df,features)
Method Follow three Steps:
These are functions that we called in the simple models and advanced models:
def models(alg, X_train, X_test, y_train, y_test,test_df):
    """Fit `alg` on the training split, predict on the validation split and the
    competition test set, and plot the confusion matrix.

    Side effects: sets the globals y_probablity, y_pred and test_prob so the
    notebook cells that follow each call can reuse the predictions.
    """
    model = alg
    model_alg = model.fit(X_train, y_train)
    global y_probablity, y_pred, test_prob  # kept alive after the function returns
    y_probablity = model_alg.predict_proba(X_test)[:,1]  # P(label == 1) on the validation split
    y_pred = model_alg.predict(X_test)                   # hard labels on the validation split
    test_prob = model_alg.predict_proba(test_df)[:,1]    # P(label == 1) on the real test file
    # (The original stored model_alg.predict(X_train) in an unused local
    # `train_pred`; that dead statement has been removed.)
    name = type(model).__name__
    nn_cm = confusion_matrix(y_test, y_pred)  # rows = true labels, cols = predicted labels
    # Visualization:
    f, ax = plt.subplots(figsize=(5,5))
    sns.heatmap(nn_cm, annot=True, linewidth=0.7, linecolor='olive', fmt='.0f', ax=ax, cmap='YlGnBu')
    plt.title(name)
    plt.xlabel('y_pred')
    plt.ylabel('y_test')
    plt.show()
def Check(model):
    """5-fold cross-validated ROC for `model`: plot each fold's ROC curve plus
    the mean curve, and print train/test accuracy per fold.

    Uses the module-level x (features) and y (labels). The AUC shown in the
    legend is the AUC of the mean ROC curve across the folds.
    """
    tprs = []   # per-fold TPRs interpolated onto a common FPR grid
    aucs = []   # per-fold AUC values
    mean_fpr = np.linspace(0,1,100)
    fig1 = plt.figure(figsize=[12,12])
    cv = KFold(n_splits=5, random_state=7, shuffle=True)
    i = 1  # fold counter (the original initialised this twice — once was removed)
    for train_index, test_index in cv.split(x):  # one iteration per fold
        # iloc selects this fold's rows by position
        X_train = x.iloc[train_index]
        X_test = x.iloc[test_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]
        model.fit(X_train, y_train)  # Run model
        prediction = model.predict_proba(X_test)
        fpr, tpr, t = roc_curve(y_test, prediction[:, 1])  # ROC points for this fold
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        print("Training Data Accuracy:", model.score(X_train,y_train)*100)
        print("Test Data Accuracy:", model.score(X_test,y_test)*100)
        plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
        i = i+1
    plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')  # chance diagonal
    mean_tpr = np.mean(tprs, axis=0)
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color='blue',
             label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),lw=2, alpha=1)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc="lower right")
    plt.text(0.32,0.7,'More accurate area',fontsize = 12)
    plt.text(0.63,0.4,'Less accurate area',fontsize = 12)
    plt.show()
Parameters:
knn = KNeighborsClassifier(n_neighbors=3)
Check(knn)
In This Model:
models(KNeighborsClassifier(n_neighbors=3), X_train, X_test, y_train, y_test,test_df)
test_pred = pd.DataFrame() #making a DataFrame for the test predictions
test_pred['KNN Prediction'] = test_prob #Adding the predections for this specific alg
lr = LogisticRegression()
Check(lr)
In This Model:
models(LogisticRegression(), X_train, X_test, y_train, y_test,test_df)
test_pred['LR Prediction'] = test_prob #Adding the predections for this specific alg
Parameters:
bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False
rf = RandomForestClassifier()
Check(rf)
In This Model:
models(RandomForestClassifier(), X_train, X_test, y_train, y_test,test_df)
test_pred['RF Prediction'] = test_prob #Adding the predections for this specific alg
Parameters:
activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08, hidden_layer_sizes=(100,), learning_rate='constant', learning_rate_init=0.001, max_fun=15000, max_iter=200, momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5, random_state=None, shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False, warm_start=False
mlp = MLPClassifier()
Check(mlp)
In This Model:
models(MLPClassifier(), X_train, X_test, y_train, y_test,test_df)
test_pred['MLP Prediction'] = test_prob #Adding the predections for this specific alg
The Confustion Metric and The K-Fold Cross Validation is added in the Models/Advance Models Part.
If we look at the accuacry difference of training and test in every K-Fold we can detect that although there was little biase toward the training data but Overall Models are Not Overfitting Like MLP, SVC, Logistic Regression etc
Confusion metric is used to evaluate how much the model has predicted correctly, It combine true label and predicted label and gave its evaluation in four ways:
Cross-validation is primarily used in applied machine learning to estimate the skill of a machine learning model on unseen data. That is, to use a limited sample in order to estimate how the model is expected to perform in general when used to make predictions on data not used during the training of the model.
Cross-validation procedure has a single parameter called k that refers to the number of groups that a given data sample is to be split into. As such, the procedure is often called k-fold cross-validation.
In addition to this method we use ROC curve/AUC in order to evaluate the model quality.
AUC stands for "Area under the ROC Curve." That is, AUC measures the entire two-dimensional area underneath the entire ROC curve,AUC (Area under the ROC Curve).
AUC provides an aggregate measure of performance across all possible classification thresholds. So, we are looking for higher AUC that indicates a better model.
Under-fitting and over-fitting are two problems of machine learning. A model usually underperforms due to one of these reasons.
Under fitting happens when the model is too simple i.e. it contains less features to be trained or is regularized too much that the model couldn’t learn anything from the dataset which leads to less variance and too much biasness in predicting wrong outcomes. While on other hand over-fitting in models occur when they are trained so much on the training data that they eventually fail on providing a good prediction on any general unseen dataset (test data).
Both of these issues do not have any fixed solution but can be prevented through number of ways which are implemented in our model i.e:
test_pred
We choose "Random Forest" model, because it gave us the highest AUC(=92) which indicates to better prediction.
Choosing Random Forest predictions to submit on our output file:
test_pred['RF Prediction']
test_pred['RF Prediction'].to_csv("submission_group_12.csv")
These models were tried while making the project; they were excluded because of their low AUC.
All of this code is written in Markdown cells and is not part of the workflow.
Parameters:
priors=None
,var_smoothing=1e-09
Code:
# 5-fold CV ROC for Gaussian Naive Bayes (excluded model — low AUC).
nb = GaussianNB()
tprs = []   # per-fold interpolated TPRs
aucs = []   # per-fold AUC values
mean_fpr = np.linspace(0,1,100)
i = 1  # fold counter
fig1 = plt.figure(figsize=[12,12])
cv = KFold(n_splits=5, random_state=7, shuffle=True)
for train_index, test_index in cv.split(x):
    X_train = x.iloc[train_index]
    X_test = x.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]
    nb.fit(X_train, y_train)  # Run model
    prediction = nb.predict_proba(X_test)
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    # BUG FIX: the original read `nb.score(...)100` — the `*` was missing,
    # which is a syntax error.
    print("Training Data Accuracy:", nb.score(X_train,y_train)*100)
    print("Test Data Accuracy:", nb.score(X_test,y_test)*100)
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i = i+1
plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')  # chance diagonal
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),lw=2, alpha=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32,0.7,'More accurate area',fontsize = 12)
plt.text(0.63,0.4,'Less accurate area',fontsize = 12)
plt.show()
In This Model:
Code:
models(GaussianNB(), X_train, X_test, y_train, y_test,test_df)
Parameters:
ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best'
Code:
# 3-fold CV ROC for a decision tree (excluded model — low AUC).
dt = DecisionTreeClassifier()
scores = []   # per-fold test accuracy
y_preds = []  # per-fold predicted labels
tprs = []     # per-fold interpolated TPRs
aucs = []     # per-fold AUC values
mean_fpr = np.linspace(0,1,100)
i = 1  # fold counter
fig1 = plt.figure(figsize=[12,12])
cv = KFold(n_splits=3, random_state=47, shuffle=True)
for train_index, test_index in cv.split(x):
    X_train = x.iloc[train_index]
    X_test = x.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]
    dt.fit(X_train, y_train)  # Run model
    prediction = dt.predict_proba(X_test)
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    scores.append(dt.score(X_test, y_test))
    # BUG FIX: the original read `dt.score(...)100` — the `*` was missing,
    # which is a syntax error.
    print("Training Data Accuracy:", dt.score(X_train,y_train)*100)
    print("Test Data Accuracy:", dt.score(X_test,y_test)*100)
    y_preds.append(dt.predict(X_test))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i = i+1
plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')  # chance diagonal
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),lw=2, alpha=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32,0.7,'More accurate area',fontsize = 12)
plt.text(0.63,0.4,'Less accurate area',fontsize = 12)
plt.show()
In This Model:
Code:
models(DecisionTreeClassifier(), X_train, X_test, y_train, y_test,test_df)
Parameters:
C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False
Code:
# 3-fold CV ROC for an SVC (excluded model — low AUC).
svc = SVC(probability=True)
scores = []   # per-fold test accuracy
y_preds = []  # per-fold predicted labels
tprs = []     # per-fold interpolated TPRs
aucs = []     # per-fold AUC values
mean_fpr = np.linspace(0,1,100)
i = 1  # fold counter
fig1 = plt.figure(figsize=[12,12])
cv = KFold(n_splits=3, random_state=47, shuffle=True)
for train_index, test_index in cv.split(x):
    X_train = x.iloc[train_index]
    X_test = x.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]
    svc.fit(X_train, y_train)  # Run model
    prediction = svc.predict_proba(X_test)
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    # BUG FIX: the original appended dt.score(...) — the decision tree from the
    # previous section — score the SVC being evaluated instead.
    scores.append(svc.score(X_test, y_test))
    # BUG FIX: the original read `svc.score(...)100` — the `*` was missing,
    # which is a syntax error.
    print("Training Data Accuracy:", svc.score(X_train,y_train)*100)
    print("Test Data Accuracy:", svc.score(X_test,y_test)*100)
    y_preds.append(svc.predict(X_test))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i = i+1
plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')  # chance diagonal
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),lw=2, alpha=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32,0.7,'More accurate area',fontsize = 12)
plt.text(0.63,0.4,'Less accurate area',fontsize = 12)
plt.show()
In This Model:
Code:
models(SVC(probability=True), X_train, X_test, y_train, y_test,test_df)
All the visualizations and steps are also explained in the report.